Data preparation

In the examples below, we’ll be working with a random sample of 10,000 public Facebook posts by Members of the U.S. Congress. The overall question that we will be trying to answer is: what types of posts get more likes?

library(DBI)
# Open the SQLite database that holds the `posts` and `congress` tables.
db <- dbConnect(drv = RSQLite::SQLite(), dbname = "~/data/facebook-db.sqlite")

# Draw a random sample of 10,000 posts, each joined to the metadata of the
# member of Congress who published it (matched on screen_name).
df <- dbGetQuery(
  conn = db,
  statement = "SELECT posts.screen_name, date, posts.type AS post_type, 
                      message, likes_count, comments_count, shares_count,
                      love_count, haha_count, wow_count, angry_count,
                      sad_count, gender, congress.type, party
                 FROM posts JOIN congress
                 ON congress.screen_name = posts.screen_name
                 ORDER BY RANDOM()
                 LIMIT 10000"
)
## Warning in rsqlite_fetch(res@ptr, n = n): Column `screen_name`: mixed type,
## first seen values of type string, coercing other values of type real
# The same sample is also available as a CSV file; note that running this
# line replaces the `df` created by the query above.
df <- read.csv(file = "fb-congress-data.csv", stringsAsFactors = FALSE)

And now we load the ggplot2 package:

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4

Univariate analysis for a single continuous variable

# Shared base layer: number of likes per post on the x axis.
p <- ggplot(data = df, mapping = aes(x = likes_count))

# Histogram of likes for each post.
p + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Density of likes for each post.
p + geom_density()

# Same two plots with the x axis transformed to a log10 scale.
p +
  geom_histogram() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 22 rows containing non-finite values (stat_bin).

p +
  geom_density() +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22 rows containing non-finite values (stat_density).

# why does this line of code drop some observations?

Univariate analysis for a single categorical variable

# Base layer: post type (categorical) on the x axis.
p <- ggplot(data = df, mapping = aes(x = post_type))

# Bar chart: number of posts by type.
p + geom_bar()

# Same bar chart with the axes flipped so the bars run horizontally.
p +
  geom_bar() +
  coord_flip()

Bivariate analysis for two continuous variables

# Base layer: likes on the x axis, comments on the y axis.
p <- ggplot(df, aes(x = likes_count, y = comments_count))

# Scatter plot: relationship between number of likes and number of comments.
p + geom_point()

# Same scatter plot with both axes on a log10 scale.
p + geom_point() + scale_x_log10() + scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

# Add a smoothed trend line. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned, whereas `TRUE` is a reserved word.
p + geom_point() + stat_smooth(na.rm = TRUE)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Smoothed trend line on the log-log version of the plot.
p + geom_point() + scale_x_log10() + scale_y_log10() +
  stat_smooth()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 652 rows containing non-finite values (stat_smooth).

Another example, but this time with a line plot

# Daily posting volume: one row per date with the number of posts on it.
counts <- dbGetQuery(
  conn = db,
  statement = "SELECT date, COUNT(1) as post_count
                     FROM posts 
                     GROUP BY date
                     ORDER BY date"
)

# Convert the date column with as.Date() so the x axis is a proper time axis.
p <- ggplot(
  data = counts,
  mapping = aes(x = as.Date(date), y = post_count)
)

# Line plot: posts per day.
p + geom_line()

Bivariate analysis for one continuous variable and one categorical variable

# Base layer: post type (categorical) on x, likes (continuous) on y.
p <- ggplot(data = df, mapping = aes(x = post_type, y = likes_count))

# Box plot: number of likes by type of post.
p + geom_boxplot()

# Box plot again, with likes on a log10 scale.
p +
  geom_boxplot() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 22 rows containing non-finite values (stat_boxplot).

# Violin plot of the same comparison.
p +
  geom_violin() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 22 rows containing non-finite values (stat_ydensity).

# Same idea with a density plot: one curve of likes per party.
p <- ggplot(data = df, mapping = aes(x = likes_count))
p +
  geom_density(mapping = aes(color = party)) +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 22 rows containing non-finite values (stat_density).

Bivariate analysis for two categorical variables

# Number of posts for every (post type, party) pair, excluding Independents.
counts <- dbGetQuery(
  conn = db,
  statement = "SELECT posts.type, congress.party, 
                        COUNT(1) AS post_count
                     FROM posts JOIN congress
                     ON congress.screen_name = posts.screen_name
                     WHERE party != 'Independent'
                     GROUP BY posts.type, congress.party"
)

# Heat map: one tile per party/type combination, filled by the post count.
p <- ggplot(data = counts, mapping = aes(x = party, y = type))
p + geom_tile(mapping = aes(fill = post_count))

Multivariate analysis for three continuous variables

# Likes vs. comments, with point colour encoding the log-scaled number of
# angry reactions. log1p() (i.e. log(1 + x)) is used instead of log() because
# log(0) is -Inf, which would leave every post with zero angry reactions
# outside the continuous colour scale.
p <- ggplot(df, aes(x = likes_count, y = comments_count,
                    color = log1p(angry_count)))

# Plain scatter plot on the original scales.
p + geom_point()

# Log-log scatter plot with a linear fit.
p + geom_point() + scale_y_log10() + scale_x_log10() +
  stat_smooth(method = "lm")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 652 rows containing non-finite values (stat_smooth).

Multivariate analysis for two continuous variables and one categorical variable

# Likes vs. comments on log10 axes, split into one panel per post type and
# laid out in two rows.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  facet_wrap(~post_type, nrow = 2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

# Restrict to posts with more than 10,000 likes and draw each one as its
# party label rather than as a dot.
p <- ggplot(
  data = df[df$likes_count > 10000, ],
  mapping = aes(x = likes_count, y = comments_count, label = party)
)
p +
  geom_text() +
  scale_x_log10() +
  scale_y_log10()

        ## geom_text() to use party names instead of points

Another example, using time and counts by party:

## counting number of posts by party and month
# The first seven characters of the date are used as the month key (this
# presumes dates are stored as 'YYYY-MM-DD' text -- confirm against the
# posts table); posts are then counted per month and party, excluding
# Independents.
counts <- dbGetQuery(
  conn = db,
  statement = "SELECT SUBSTR(posts.date, 1, 7) AS month, 
                        congress.party, 
                        COUNT(1) AS post_count
                     FROM posts JOIN congress
                     ON congress.screen_name = posts.screen_name
                     WHERE party != 'Independent'
                     GROUP BY month, congress.party"
)

# Append "-01" so each month string parses as the first day of that month.
p <- ggplot(
  data = counts,
  mapping = aes(
    x = as.Date(paste0(month, "-01")),
    y = post_count,
    group = party
  )
)

# One line per party. The colours are unnamed, so they are assigned in the
# order of the party levels -- NOTE(review): confirm that order gives the
# intended blue/red assignment.
p +
  geom_line(mapping = aes(color = party)) +
  scale_color_manual(values = c("blue", "red"))

## line: posts per month, by party

Other examples:

# All four plots below use only posts with more than 5,000 likes and put
# likes vs. comments on log10 axes; they differ in the extra aesthetics.

# Scatter plot with dots colored by type of post.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(color = post_type)) +
  scale_x_log10() +
  scale_y_log10()

# Same, but post type is mapped to point shape.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(shape = post_type)) +
  scale_x_log10() +
  scale_y_log10()

# Combining both: post type drives shape and color at once.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(mapping = aes(shape = post_type, color = post_type)) +
  scale_x_log10() +
  scale_y_log10()

# This can be very easily extended to multiple scales: shape by gender,
# color by post type, size by (log10) share count, one panel per post type.
p <- ggplot(
  data = df[df$likes_count > 5000, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_point(
    mapping = aes(shape = gender, color = post_type, size = shares_count)
  ) +
  scale_x_log10() +
  scale_y_log10() +
  scale_size(trans = "log10") +
  facet_wrap(~post_type, nrow = 2)
## Warning: Transformation introduced infinite values in discrete y-axis
## Warning in sqrt(x): NaNs produced
## Warning: Removed 2 rows containing missing values (geom_point).

Dealing with overplotting issues

# Baseline: plain log-log scatter plot of likes vs. comments.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis

# Jittering points (useful for counts): add random noise of up to 0.5 in
# each direction.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_jitter(position = position_jitter(width = 0.5, height = 0.5)) +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

# Transparency: same jitter, with each point drawn at alpha = 1/25.
p <- ggplot(data = df, mapping = aes(x = likes_count, y = comments_count))
p +
  geom_jitter(
    position = position_jitter(width = 0.5, height = 0.5),
    alpha = 1 / 25
  ) +
  scale_x_log10() +
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis

## Warning: Transformation introduced infinite values in continuous y-axis

# Hexbin: keep only posts with positive likes and comments, bin them into
# hexagons and put the fill (count) scale on log10 as well.
p <- ggplot(
  data = df[df$likes_count > 0 & df$comments_count > 0, ],
  mapping = aes(x = likes_count, y = comments_count)
)
p +
  geom_hex() +
  scale_x_log10() +
  scale_y_log10() +
  scale_fill_continuous(trans = "log10")